import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #for plotting
from sklearn.ensemble import RandomForestClassifier #for the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.model_selection import train_test_split #for data splitting
import eli5 #for purmutation importance
from eli5.sklearn import PermutationImportance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score
from xgboost import XGBClassifier, plot_importance
from pdpbox import pdp, get_dataset, info_plots
# model explainers
import lime
from lime.lime_tabular import LimeTabularExplainer
import shap
from shap import TreeExplainer,KernelExplainer,LinearExplainer
shap.initjs()  # load SHAP's JS visualisation library (notebook output only)
import eli5
import shap
# FIX: "pip install PDPbox" is a shell command, not Python — as bare code it is
# a SyntaxError. Run it from a terminal, or as "!pip install PDPbox" in a
# notebook cell.
# pip install PDPbox
np.random.seed(123) #ensure reproducibility
pd.options.mode.chained_assignment = None #hide any pandas warnings
from pdpbox import pdp, info_plots #for partial plots
# Load the heart-disease dataset: one row per patient; 'target' is 1 when
# disease is present, 0 otherwise.
data = pd.read_csv("/Users/odianosenakhibi/Downloads/Thesis/data/heart.csv")
data.head(10)
data.describe()
data.info()
data.dtypes
# Visual missing-value check: any NaN shows up as a differently coloured cell.
plt.figure(figsize=(10, 8))
sns.heatmap(data.isnull().T, cbar=False)
# One-hot encode any categorical columns; drop_first avoids the dummy trap.
data = pd.get_dummies(data, drop_first=True)
data.head()
sns.countplot(x='target', data=data)
sns.pairplot(data)
plt.show()
# Show the results of a linear regression within each dataset
sns.lmplot(x="trestbps", y="chol", data=data, hue="cp")
plt.show()
# FIX: pass the series by keyword — a bare positional data argument to
# seaborn plot functions is deprecated and removed in seaborn >= 0.12.
sns.violinplot(x=data.age, palette="Set3", bw=.2, cut=1, linewidth=1)
plt.xticks(rotation=90)
plt.title("Age Rates")
plt.show()
plt.figure(figsize=(15, 7))
sns.violinplot(x=data.age, y=data.target)
plt.xticks(rotation=90)
plt.legend()  # NOTE(review): no labelled artists here — emits a warning only
plt.title("Age & Target System")
plt.show()
#split the data
# FIX: axis must be passed by keyword — the positional axis argument to
# DataFrame.drop was removed in pandas 2.0.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('target', axis=1), data['target'],
    test_size=.2, random_state=10)
#Let's see how the correlation values between them
data.corr()
print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)
#Normalization
# FIX: min-max statistics are computed on the TRAINING set only and reused for
# the test set. The original normalised X_test with its own min/max (test-set
# leakage) and, worse, trained the model on UNnormalised features while later
# cells predict on normalised ones.
train_min = X_train.min()
train_max = X_train.max()
X_train = (X_train - train_min) / (train_max - train_min)
X_test = (X_test - train_min) / (train_max - train_min)
# Fit the forest AFTER normalisation so training and inference see the same
# feature scale.
model = RandomForestClassifier(max_depth=5)
model.fit(X_train, y_train)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import os
from sklearn.decomposition import PCA

# Inspect how much variance each principal component explains.
pca = PCA().fit(X_train)
print(pca.explained_variance_ratio_)
print()
print(X_train.columns.values.tolist())
print(pca.components_)
#To reduce the data dimensions and noise in the input data
pca = PCA(n_components=8)
pca.fit(X_train)
reduced_data_train = pca.transform(X_train)
#inverse_data = pca.inverse_transform(reduced_data)
plt.scatter(reduced_data_train[:, 0], reduced_data_train[:, 1],
            label='reduced')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()
# FIX: project the test set with the SAME PCA fitted on the training data.
# The original re-fitted a fresh PCA on X_test, which both leaks test
# information and produces axes that are not comparable to the training plot.
reduced_data_test = pca.transform(X_test)
plt.scatter(reduced_data_test[:, 0], reduced_data_test[:, 1],
            label='reduced')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()
# Visualise a single tree taken from the fitted random forest.
estimator = model.estimators_[1]
feature_names = list(X_train.columns)
# Map the numeric labels to human-readable class names for the plot.
y_train_str = y_train.astype('str')
y_train_str[y_train_str == '0'] = 'no disease'
y_train_str[y_train_str == '1'] = 'disease'
y_train_str = y_train_str.values
# Dump the tree to Graphviz .dot format.
export_graphviz(
    estimator,
    out_file='tree.dot',
    feature_names=feature_names,
    class_names=y_train_str,
    rounded=True,
    proportion=True,
    label='root',
    precision=2,
    filled=True,
)
from subprocess import call
# Render the .dot file to PNG (requires the Graphviz `dot` binary on PATH).
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
from IPython.display import Image
Image(filename='tree.png')
#to capture the interaction between the features of the data
y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)[:, 1]  # P(disease), used for ROC
y_pred_bin = model.predict(X_test)
# FIX: use a distinct name — assigning the result to `confusion_matrix`
# shadowed the sklearn function and would break any later call to it.
cm = confusion_matrix(y_test, y_pred_bin)
cm
total = sum(sum(cm))
# sklearn's confusion matrix layout is rows = actual, cols = predicted:
#   cm = [[TN, FP],
#         [FN, TP]]
# FIX: sensitivity (recall of the positive class) is TP / (TP + FN); the
# original computed TN/(TN+FN) which is the negative predictive value.
sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
print('Sensitivity : ', sensitivity)
# FIX: specificity (recall of the negative class) is TN / (TN + FP); the
# original computed TP/(TP+FP) which is precision.
specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
print('Specificity : ', specificity)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve for Heart Disease classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
auc(fpr, tpr)
import seaborn as sns
from xgboost import XGBClassifier, plot_importance
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
# FIX: "%matplotlib inline" is IPython magic — a SyntaxError in a plain .py
# file. Keep it only inside a notebook cell.
# %matplotlib inline
# for visualizing correlations
f, ax = plt.subplots(figsize=(10, 6))
corr = data.corr()
hm = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="Reds", fmt='.2f',
                 linewidths=.05)
f.subplots_adjust(top=0.93)
t = f.suptitle('Heart Disease Attributes Correlation Heatmap', fontsize=14)
target = 'target'
features_list = list(data.columns)
features_list.remove(target)
sns.set()
# Line plot of the mean target rate per chest-pain type.
sns.relplot(data=data, x='cp', y=target, kind='line', height=5, aspect=2, color='red')
# FIX: "%%time" is an IPython cell magic — invalid syntax outside a notebook.
# %%time
# ML in two lines ;)
xgb = XGBClassifier(objective='binary:logistic', random_state=33, n_jobs=-1)
# FIX: the model was constructed and fitted twice identically; one fit suffices.
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)
# We design a simple classification evaluative function
def evaluation_scores(test, prediction, target_names=None):
    """Print accuracy, the per-class report and a labelled confusion matrix.

    Parameters
    ----------
    test : array-like of true binary labels (0/1)
    prediction : array-like of predicted binary labels (0/1)
    target_names : optional list of display names for the two classes
    """
    # FIX: import locally — the module-level `from sklearn import metrics`
    # only executes further down the script, so `metrics` was an unresolved
    # name inside this function until then.
    from sklearn import metrics
    print('Accuracy:', np.round(metrics.accuracy_score(test, prediction), 10))
    print('-' * 60)
    print('classification report:\n\n',
          metrics.classification_report(y_true=test, y_pred=prediction,
                                        target_names=target_names))
    classes = [0, 1]
    total_classes = len(classes)
    level_labels = [total_classes * [0], list(range(total_classes))]
    cm = metrics.confusion_matrix(y_true=test, y_pred=prediction, labels=classes)
    # FIX: `codes=` replaces the `labels=` keyword, which was removed from
    # pd.MultiIndex in pandas 1.0.
    cm_frame = pd.DataFrame(
        data=cm,
        columns=pd.MultiIndex(levels=[['Predicted:'], classes], codes=level_labels),
        index=pd.MultiIndex(levels=[['Actual:'], classes], codes=level_labels))
    print('-' * 60)
    print('Confusion matrix:\n')
    print(cm_frame)
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_curve, auc

# Accuracy, classification report and confusion matrix for the XGBoost model.
evaluation_scores(y_test, xgb_predictions, target_names=['no disease', 'disease'])
#Showing the main classification metrics, (precision, recall and f1 score)
# calculate the FPR and TPR for all thresholds of the classification
probs = xgb.predict_proba(X_test)
preds = probs[:, 1]  # probability of the positive (disease) class
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
# AUC-ROC summarises performance across all thresholds; the curve shows how
# well the model separates the two classes.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'red', label='ROC AUC score = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'b--')  # chance diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
auc(fpr, tpr)
# Plot XGBoost's three native feature-importance criteria side by side.
fig = plt.figure(figsize=(18, 10))
title = fig.suptitle("Native Feature Importances from XGBoost", fontsize=14)
importance_panels = [
    ('weight', "Feature Importance with Feature Weight"),
    ('cover', "Feature Importance with Sample Coverage"),
    ('gain', "Feature Importance with Split Mean Gain"),
]
for slot, (imp_type, panel_title) in enumerate(importance_panels, start=1):
    panel_ax = fig.add_subplot(2, 2, slot)
    plot_importance(xgb, importance_type=imp_type, ax=panel_ax, color='red')
    panel_ax.set_title(panel_title)
import eli5
from eli5.sklearn import PermutationImportance
eli5.show_weights(xgb.get_booster())
#this gives us an idea of how the model performs globally
# FIX: "%%time" is an IPython cell magic — invalid syntax in a plain .py file.
# %%time
# we need to retrain a new model with arrays
# as eli5 has a bug with Dataframes and XGBoost
# cf. https://github.com/TeamHG-Memex/eli5/pull/261
xgb_array = XGBClassifier(objective='binary:logistic', random_state=33, n_jobs=-1)
# FIX: the model was constructed and fitted twice identically; once suffices.
xgb_array.fit(X_train, y_train)
# Permutation importance: shuffle each feature and measure the score drop.
feat_permut = PermutationImportance(xgb_array,
                                    random_state=1).fit(X_train, y_train)
eli5.show_weights(feat_permut, feature_names=features_list)
from pdpbox import pdp, get_dataset, info_plots


def plot_pdp(model, df, feature, cluster_flag=False, nb_clusters=None, lines_flag=False):
    """Draw a PDPbox partial-dependence (or ICE) plot for one feature.

    Parameters
    ----------
    model : fitted estimator
    df : DataFrame with the feature columns the model was trained on
    feature : name of the column to isolate
    cluster_flag, nb_clusters, lines_flag : ICE options forwarded to pdp_plot
    """
    # FIX: use the DataFrame passed in (`df`), not the module-level `data` —
    # the global still holds the raw, unsplit frame (including the target
    # column), so the argument was silently ignored.
    pdp_goals = pdp.pdp_isolate(model=model, dataset=df,
                                model_features=df.columns.tolist(),
                                feature=feature)
    # plot it
    pdp.pdp_plot(pdp_goals, feature, cluster=cluster_flag,
                 n_cluster_centers=nb_clusters, plot_lines=lines_flag)
    plt.show()
# Univariate partial-dependence plot for chest-pain type ('cp').
plot_pdp(xgb, X_train, 'cp')
# ICE variant: the number of similarity clusters must be given (24 here).
plot_pdp(xgb, X_train, 'cp', cluster_flag=True, nb_clusters=24, lines_flag=True)
# Two-way interaction between max heart rate and age.
features_to_plot = ['thalach', 'age']
inter1 = pdp.pdp_interact(model=xgb, dataset=X_train,
                          model_features=features_list,
                          features=features_to_plot)
# plot_type='grid' is used because the nicer 'contour' option has a known
# PDPbox bug, cf. https://github.com/SauceCat/PDPbox/issues/40
pdp.pdp_interact_plot(pdp_interact_out=inter1,
                      feature_names=features_to_plot,
                      plot_type='grid')
plt.show()
# One-hot encode any string-typed columns still present. After the earlier
# pd.get_dummies pass this loop is normally a no-op.
cat_cols = list(data.select_dtypes('object').columns)
class_dict = {}  # NOTE(review): populated nowhere in this file
for col in cat_cols:
    dummies = pd.get_dummies(data[col])
    data = pd.concat([data.drop(col, axis=1), dummies], axis=1)
data.head()
# Train three baseline models and tabulate weighted F1, accuracy and ROC-AUC.
ML_models = {}
model_index = ['LR', 'RF', 'NN']
model_sklearn = [
    LogisticRegression(solver='liblinear', random_state=0),
    RandomForestClassifier(n_estimators=100, random_state=0),
    MLPClassifier([100] * 5, early_stopping=True, learning_rate='adaptive',
                  random_state=0),
]
model_summary = []
for name, model in zip(model_index, model_sklearn):
    ML_models[name] = model.fit(X_train, y_train)
    preds = model.predict(X_test)
    row = [
        name,
        f1_score(y_test, preds, average='weighted'),
        accuracy_score(y_test, preds),
        roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
    ]
    model_summary.append(row)
ML_models
model_summary = pd.DataFrame(model_summary,
                             columns=['Name', 'F1_score', 'Accuracy', 'AUC_ROC'])
model_summary = model_summary.reset_index()
display(model_summary)  # NOTE(review): `display` exists only under IPython
# Scatter each model's AUC, annotating the model name above its marker.
g = sns.regplot(data=model_summary, x="index", y="AUC_ROC", fit_reg=False,
                marker="o", color="black", scatter_kws={'s': 500})
for i in range(model_summary.shape[0]):
    g.text(model_summary.loc[i, 'index'],
           model_summary.loc[i, 'AUC_ROC'] + 0.02,
           model_summary.loc[i, 'Name'],
           horizontalalignment='center', verticalalignment='top',
           size='large', color='black')
#initialization of an explainer from LIME
# FIX: class_names must follow the classifier's class order ([0, 1]); per the
# label mapping used earlier in this file, 0 = 'no disease' and 1 = 'disease',
# so the original ['disease', 'no disease'] labelled every explanation with
# the opposite class.
explainer = LimeTabularExplainer(X_train.values,
                                 mode='classification',
                                 feature_names=X_train.columns,
                                 class_names=['no disease', 'disease'])
# Explain the first test instance under each of the three trained models.
exp = explainer.explain_instance(X_test.head(1).values[0],
                                 ML_models['LR'].predict_proba,
                                 num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True, show_all=True)
exp = explainer.explain_instance(X_test.head(1).values[0],
                                 ML_models['RF'].predict_proba,
                                 num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True, show_all=False)
exp = explainer.explain_instance(X_test.head(1).values[0],
                                 ML_models['NN'].predict_proba,
                                 num_features=X_train.shape[1])
exp.show_in_notebook(show_table=True, show_all=False)
#ELI5 — global weights, one single-instance explanation and permutation
# importance for each model.
feature_labels = list(X_test.columns)
eli5.show_weights(ML_models['LR'], feature_names=feature_labels)
eli5.show_prediction(ML_models['LR'], X_test.head(1).values[0],
                     feature_names=feature_labels)
exp = PermutationImportance(ML_models['LR'], random_state=0).fit(X_test, y_test)
eli5.show_weights(exp, feature_names=feature_labels)
eli5.show_weights(ML_models['RF'], feature_names=feature_labels)
eli5.show_prediction(ML_models['RF'], X_test.head(1).values[0],
                     feature_names=feature_labels)
exp = PermutationImportance(ML_models['RF'], random_state=0).fit(X_test, y_test)
eli5.show_weights(exp, feature_names=feature_labels)
eli5.show_weights(ML_models['NN'])  # no feature_names: MLP weights are opaque
# Global SHAP explanation of the XGBoost model via a tree explainer.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)
X_shap = pd.DataFrame(shap_values)
X_shap.tail()
print('Expected Value: ', explainer.expected_value)
# Bar chart of mean |SHAP value| per feature.
shap.summary_plot(shap_values, X_test, plot_type="bar", color='red')
#SHAP
# LinearExplainer for the logistic regression.
# NOTE(review): `feature_dependence` was renamed to `feature_perturbation` in
# newer shap releases — update this call if the shap dependency is upgraded.
explainer = LinearExplainer(ML_models['LR'], X_train, feature_dependence="independent")
first_row = X_test.head(1).values
shap_values = explainer.shap_values(first_row)
shap.force_plot(explainer.expected_value,
                shap_values,
                first_row,
                feature_names=X_test.columns)
# Force plot over the first 250 test rows.
head_rows = X_test.head(250).values
shap_values = explainer.shap_values(head_rows)
shap.force_plot(explainer.expected_value,
                shap_values,
                head_rows,
                feature_names=X_test.columns)
# Beeswarm summary over the full test set.
shap_values = explainer.shap_values(X_test.values)
spplot = shap.summary_plot(shap_values, X_test.values, feature_names=X_test.columns)
# Dependence plots for the four most influential features.
top4_cols = ['ca', 'cp', 'thalach', 'sex']
for col in top4_cols:
    shap.dependence_plot(col, shap_values, X_test)
# SHAP for the random forest: TreeExplainer, indexing [1] selects the
# positive (disease) class output.
explainer = TreeExplainer(ML_models['RF'])
first_instance = X_test.head(1).values
shap_values = explainer.shap_values(first_instance)
shap.force_plot(explainer.expected_value[1],
                shap_values[1],
                first_instance,
                feature_names=X_test.columns)
# SHAP for the MLP: KernelExplainer over a k-means summary (10 centroids) of
# the training set, which keeps the kernel estimation tractable.
X_train_kmeans = shap.kmeans(X_train, 10)
explainer = KernelExplainer(ML_models['NN'].predict_proba, X_train_kmeans)
shap_values = explainer.shap_values(first_instance)
shap.force_plot(explainer.expected_value[1],
                shap_values[1],
                first_instance,
                feature_names=X_test.columns)